Brian S. Evans, Ph.D.
Migratory Bird Center
Smithsonian Conservation Biology Institute
# Load libraries
library(RCurl)
library(tidyverse)# Read data:
# Base address of the workshop repository that hosts the example CSV files:
url <- 'https://raw.githubusercontent.com/bsevansunc/workshop_languageOfR/master'
# Download each file. Despite their names, habitsURL and countsURL hold the
# downloaded CSV text returned by getURL(), not addresses:
habitsURL <- getURL(
  paste(url, 'birdHabits.csv', sep = '/')
)
countsURL <- getURL(
  paste(url, 'birdCounts.csv', sep = '/')
)
# Read in the data:
# (as_tibble() is used because tbl_df() is deprecated)
birdHabits <- as_tibble(
  read.csv(text = habitsURL, stringsAsFactors = FALSE)
)
birdCounts <- as_tibble(
  read.csv(text = countsURL, stringsAsFactors = FALSE)
)
# For the iris dataset, I think it’s best to do some familiar cleaning steps:
# Clean up iris for analysis:
# (as_tibble() is used because tbl_df() is deprecated)
irisTbl <- as_tibble(iris)
# Rename the five columns to camelCase:
names(irisTbl) <- c(
  'sepalLength',
  'sepalWidth',
  'petalLength',
  'petalWidth',
  'species'
)
# Functions are a type of R object that consists of commands that can be used to execute complex or repetitive tasks.
Functions take the form:
# Template only: functionName, functionTarget and functionBody are
# placeholders for the name, the argument(s) and the commands of a real
# function.
functionName <- function(functionTarget) {
functionBody
}# First function:
# addOneFun: return the input with one added to it (element-wise, exactly
# like the + operator it wraps).
addOneFun <- function(x) {
  incremented <- x + 1
  incremented
}

# Testing the function on a numeric value:
42 + 1
addOneFun(42)
# First function:
# addOneFun: return the input with one added to it.
addOneFun <- function(x) {
  x + 1
}

# Testing the function on a vector of numeric values:
v <- c(1, 1, 2, 3, 5)
v + 1
# Call the function on the vector so the two results can be compared:
addOneFun(v)
Functions can simplify writing queries!
# Explore birdCounts data:
str(birdCounts)
head(birdCounts)
# Matrix notation query:
birdCounts[birdCounts$species == 'grca', ]
Functions can simplify writing queries!
# Query by species function:
# speciesSubset: return the rows of the global birdCounts data frame whose
# species value equals spp.
speciesSubset <- function(spp) {
  isTargetSpecies <- birdCounts$species == spp
  birdCounts[isTargetSpecies, ]
}

# Test function:
birdCounts[birdCounts$species == 'grca', ]
speciesSubset('grca')# Query by species function, generalized:
# speciesSubset (generalized): return the rows of dfIn whose species value
# equals spp. dfIn must have a species column.
speciesSubset <- function(dfIn, spp) {
  keepRow <- dfIn$species == spp
  dfIn[keepRow, ]
}

# Test function, birdCounts:
birdCounts[birdCounts$species == 'grca', ]
speciesSubset(birdCounts, 'grca')
# Test function, birdHabits:
birdHabits[birdHabits$species == 'grca', ]
speciesSubset(birdHabits, 'grca')In many, but not all situations.
# Subset to catbirds using $ and matrix notation:
birdHabits[birdHabits$species == 'grca', ]
birdHabits[birdHabits[,'species'] == 'grca',]# Very generalized query:
# query: return the rows of dfIn in which the column named by variable
# equals condition.
#   dfIn      - a data frame or tibble
#   variable  - name of the column to test, given as a character string
#   condition - value the column is compared against with ==
# The column is extracted with [[ ]] so the comparison always yields a plain
# logical vector. dfIn[, variable] stays a one-column tibble when dfIn is a
# tibble, which would turn the row index into a one-column logical matrix.
query <- function(dfIn, variable, condition) {
  dfIn[dfIn[[variable]] == condition, ]
}
# Test query:
birdHabits[birdHabits$species == 'grca', ]
birdHabits[birdHabits[,'species'] == 'grca',]
query(birdHabits, 'species', 'grca')birdHabits data frame to just ground foraging birds.
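Exercise: write a function that calculates the total number of birds of a given species in the birdCounts data frame (i.e., the sum of count for a data frame subset by species).
Exercise: write a function that subsets the birdHabits data frame to just ground foraging birds.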
head(birdHabits)
birdHabits[birdHabits$foraging == 'ground',]
# foragingSubset: return the rows of the global birdHabits data frame whose
# foraging value equals foragingValue.
foragingSubset <- function(foragingValue) {
  matchesGuild <- birdHabits$foraging == foragingValue
  birdHabits[matchesGuild, ]
}
# foragingSubsetGeneral: return the rows of dfIn whose foraging value equals
# foragingValue. dfIn must have a foraging column.
foragingSubsetGeneral <- function(dfIn, foragingValue) {
  matchesGuild <- dfIn$foraging == foragingValue
  dfIn[matchesGuild, ]
}
foragingSubset('ground')
foragingSubsetGeneral(birdHabits, 'ground')birdCounts data frame(i.e., the sum of count for a data frame subset by species)
head(birdCounts)
birdCounts[birdCounts$species == 'grca',]
birdCounts[birdCounts$species == 'grca',]$count
sum(birdCounts[birdCounts$species == 'grca',]$count)
# speciesN: total of the count column over the rows of dfIn whose species
# value equals spp.
speciesN <- function(dfIn, spp) {
  sppRows <- dfIn[dfIn$species == spp, ]
  sum(sppRows$count)
}
speciesN(birdCounts, 'grca')# Query function, mean count:
# meanSpeciesCounts: mean number of birds of species spp per site, computed
# from the global birdCounts data frame. The divisor is the number of unique
# sites in the whole data frame, not only the sites where spp was recorded.
meanSpeciesCounts <- function(spp) {
  # Total count across all records of the species of interest:
  sppCounts <- birdCounts[birdCounts$species == spp, ]$count
  totalBirds <- sum(sppCounts)
  # Number of unique site values:
  siteCount <- length(unique(birdCounts$site))
  # Mean number of birds per site:
  totalBirds / siteCount
}
# What is the average number of observed catbirds?
meanSpeciesCounts('grca')# Query by species function, generalized:
# meanSpeciesCounts (generalized): mean number of birds of species spp per
# site in dfIn. Relies on speciesSubset(dfIn, spp) to pull the records of the
# species; the divisor is the number of unique sites in all of dfIn.
meanSpeciesCounts <- function(dfIn, spp) {
  # Total number of birds observed for the species of interest:
  sppRecords <- speciesSubset(dfIn, spp)
  totalBirds <- sum(sppRecords$count)
  # Number of unique site values:
  siteCount <- length(unique(dfIn$site))
  # Mean number of birds per site:
  totalBirds / siteCount
}
# What is the average number of observed catbirds?
meanSpeciesCounts(birdCounts, 'grca')birdHabits data frame, write a function to count the number of species in a given diet and foraging guild.
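Exercise: write a function that calculates the standard error of a numeric vector (hint: the function for standard deviation is sd and the function for square root is sqrt):
Using the birdHabits data frame, write a function to count the number of species in a given diet and foraging guild.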
length(unique(birdHabits[birdHabits$diet == 'omnivore' &
birdHabits$foraging == 'ground',]$species))
# dietForagingSppCount: number of unique species in the global birdHabits
# data frame that have diet equal to dietValue and foraging equal to
# foragingValue.
dietForagingSppCount <- function(dietValue, foragingValue) {
  # Narrow to the diet guild first, then to the foraging guild within it:
  inDiet <- birdHabits[birdHabits$diet == dietValue, ]
  inDietAndForaging <- inDiet[inDiet$foraging == foragingValue, ]
  # Count the distinct species that remain:
  length(unique(inDietAndForaging$species))
}
dietForagingSppCount('omnivore', 'ground')sd and the function for square root is sqrt):
# se: standard error of the mean of a numeric vector, i.e. the standard
# deviation divided by the square root of the number of values.
#   x     - numeric vector
#   na.rm - if TRUE, missing values are dropped before both the standard
#           deviation and the number of values are computed; the default
#           (FALSE) keeps the original behavior, where any NA gives NA.
se <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  sd(x) / sqrt(length(x))
}
# Why would you use for loops?
# Filter irisTbl to setosa:
irisTbl[irisTbl$species == 'setosa', ]
# Extract the petalLength field (column):
irisTbl[irisTbl$species == 'setosa', ]$petalLength
# Calculate the mean of petal lengths:
mean(irisTbl[irisTbl$species == 'setosa', ]$petalLength)Calculate the mean petal length of each of the Iris species using matrix notation (as above) and a custom function.
Calculate the mean petal length of each of the Iris species using matrix notation (as above) and a custom function.
# Mean petal lengths, matrix notation:
mean(irisTbl[irisTbl$species == 'setosa', ]$petalLength)
mean(irisTbl[irisTbl$species == 'versicolor', ]$petalLength)
mean(irisTbl[irisTbl$species == 'virginica', ]$petalLength)
# Mean petal lengths, function method:
# meanPetalFun: mean of the petalLength column over the rows of the global
# irisTbl whose species value equals spp.
meanPetalFun <- function(spp) {
  sppPetalLengths <- irisTbl[irisTbl$species == spp, ]$petalLength
  mean(sppPetalLengths)
}
meanPetalFun('setosa')
meanPetalFun('versicolor')
meanPetalFun('virginica')
Construct a vector, v using a set of five numbers.
# Generate vector v:
v <- c(1,1,2,3,5)
v
Modify the values in vector v by adding one to each value. This might be written mathematically as:
Writing proper for loops requires following these three steps:
Recall that value v[i] is equal to the value at position i in vector v. Let’s take a look at the value of v at position 3:
# Explore vector v using indexing:
i <- 3
v[i]
v[3]
v[3] == v[i]
Recall that value v[i] is equal to the value at position i in vector v. Let’s take a look at the value of v at position 3:
# Add 1 to the value of v at position three:
v[3] + 1
v[i] + 1ALWAYS specify an object to store your output!
Vector objects are defined as:
# Define a vector for output:
vNew <- vector('numeric', length = length(v))
str(vNew)ALWAYS specify an object to store your output!
# Explore filling values of vNew by index:
i <- 3
v[i]
vNew[i] <- v[i] + 1
vNew[i]
v[i] + 1 == vNew[i]The sequence can be defined as:
v
1:5
1:length(v)
seq_along(v)
# Example for loop sequence statements:
# for(i in 1:length(v))
# for(i in seq_along(v))The for loop body describes what will happen at each iteration of the loop. For example:
i <- 3
vNew[i] <- v[i] + 1# First for loop:
# Output: a numeric vector with one slot for every element of v:
vNew <- numeric(length = length(v))
# Sequence and body: add one to the value of v at each position:
for (i in seq_along(v)) {
  vNew[i] <- v[i] + 1
}
# Explore first for loop output:
vNew
v
vNew == v
# Split-Apply-Combine
# Mean petal lengths of Iris species without a for loop:
mean(irisTbl[irisTbl$species == 'setosa', ]$petalLength)
mean(irisTbl[irisTbl$species == 'versicolor', ]$petalLength)
mean(irisTbl[irisTbl$species == 'virginica', ]$petalLength)Split-Apply-Combine
Start by creating a vector of species:
# Make a vector of species to loop across:
irisSpecies <- levels(irisTbl$species)
irisSpeciesSplit-Apply-Combine
Create an empty vector to store our output:
# For loop output statement:
petalLengths <- vector('numeric',length = length(irisSpecies))
petalLengthsSplit-Apply-Combine
Split: The for loop body starts with splitting the data:
# Exploring the iris data, subsetting by species:
i <- 3
irisSpecies[i]
irisTbl[irisTbl$species == irisSpecies[i], ]
# Split:
iris_sppSubset <- irisTbl[irisTbl$species == irisSpecies[i], ]Split-Apply-Combine
Apply: Modification of the data:
# Calculate mean petal length of each subset:
mean(iris_sppSubset$petalLength)Split-Apply-Combine
# Make a vector of species to loop across:
irisSpecies <- levels(irisTbl$species)
# For loop output statement (one slot per species):
petalLengths <- vector('numeric', length = length(irisSpecies))
# For loop:
for (i in seq_along(irisSpecies)) {
  # Split: the rows that belong to species i:
  iris_sppSubset <- irisTbl[irisTbl$species == irisSpecies[i], ]
  # Apply: mean petal length of that subset, stored at position i:
  petalLengths[i] <- mean(iris_sppSubset$petalLength)
}
# Split-Apply-Combine
Combine: Combining the for loop output
# Make a tibble data frame of the for loop output:
# (tibble() is used because data_frame() is deprecated)
# NOTE(review): the count column is filled from petalLengths, which holds
# mean petal lengths rather than counts; consider a more descriptive column
# name once any downstream use has been checked.
petalLengthFrame <- tibble(species = irisSpecies, count = petalLengths)
petalLengthFrame
Use a for loop and the birdHabits data frame to calculate the number of species in each diet guild.
Use a for loop and the birdHabits data frame to calculate the number of species in each diet guild.
birdHabits
# Sequence: the unique diet guilds to loop across:
diets <- unique(birdHabits$diet)
# Output: one slot per diet guild:
outVector <- vector('numeric', length = length(diets))
for (i in seq_along(diets)) {
  # Split:
  dietSubset <- birdHabits[birdHabits$diet == diets[i], ]
  # Apply: count unique species rather than rows, so that a species listed
  # more than once would not be counted twice:
  outVector[i] <- length(unique(dietSubset$species))
}
# Combine:
# (tibble() is used because data_frame() is deprecated)
tibble(diet = diets, nSpecies = outVector)
# For loops can be used to explore data objects with common features.
How many omnivorous birds were observed at each site?
# Explore the bird count data:
head(birdCounts)
str(birdCounts)
# Explore the bird trait data:
head(birdHabits)
str(birdHabits)How many omnivorous birds were observed at each site?
Get a vector of the species that are omnivores from the birdHabits data frame:
# Extract vector of omnivorous species:
omnivores <- birdHabits[birdHabits$diet == 'omnivore', ]$species
# How many omnivorous birds were observed at each site?
Split the data into individual sites.
# Generate a vector of unique sites:
sites <- unique(birdCounts$site)
# Site at position i:
i <- 3
sites[i]
# Subset data:
birdCounts_siteSubset <- birdCounts[birdCounts$site == sites[i],]
birdCounts_siteSubsetHow many omnivorous birds were observed at each site?
Split: Use %in% to extract only records associated with omnivores and sum the count field.
# Just a vector of omnivore counts:
countVector <-
birdCounts_siteSubset[birdCounts_siteSubset$species %in%
omnivores,]$countHow many omnivorous birds were observed at each site?
Apply: Sum the count vector.
# Get total number of omnivores at the site:
nOmnivores <- sum(countVector)How many omnivorous birds were observed at each site?
Combine: Values combined using the vector method
# Sequence: the unique sites to loop across:
sites <- unique(birdCounts$site)
# Output: one slot per site (sites is already unique):
outVector <- vector('numeric', length = length(sites))
for (i in seq_along(sites)) {
  # Split: records from site i, restricted to the omnivorous species:
  birdCounts_siteSubset <- birdCounts[birdCounts$site == sites[i], ]
  countVector <-
    birdCounts_siteSubset[birdCounts_siteSubset$species %in%
                            omnivores, ]$count
  # Apply: total number of omnivores counted at the site:
  outVector[i] <- sum(countVector)
}
# Combine:
# (tibble() is used because data_frame() is deprecated)
tibble(site = sites, nOmnivores = outVector)
# How many omnivorous birds were observed at each site?
Combine: Values combined using the list method
# Sequence: the unique sites to loop across:
sites <- unique(birdCounts$site)
# Output: an empty list with one slot per site (sites is already unique):
outList <- vector('list', length = length(sites))
for (i in seq_along(sites)) {
  # Split: records from site i, restricted to the omnivorous species:
  birdCounts_siteSubset <- birdCounts[birdCounts$site == sites[i], ]
  countVector <-
    birdCounts_siteSubset[birdCounts_siteSubset$species %in%
                            omnivores, ]$count
  # Apply: a one-row tibble holding the site and its omnivore total
  # (tibble() is used because data_frame() is deprecated):
  outList[[i]] <- tibble(
    site = sites[i],
    nOmnivores = sum(countVector)
  )
}
# Combine: stack the one-row tibbles into a single data frame:
bind_rows(outList)
# For loop to generate a vector of numbers based on some mathematical function. For example:
\[n_t = 2(n_{t-1})\]
For loop to generate a vector of numbers based on some mathematical function. For example:
\[n_t = 2(n_{t-1})\]
# For loop output:
n <- vector('numeric', length = 5)
n
# Set the seed value:
n[1] <- 10
nFor loop to generate a vector of numbers based on some mathematical function. For example:
\[n_t = 2(n_{t-1})\]
# For loop sequence:
# for(i in 2:length(n))For loop to generate a vector of numbers based on some mathematical function. For example:
\[n_t = 2(n_{t-1})\]
Body: For each iteration (example, position 2):
# Exploring the construction of the for loop body:
i <- 2
n[i]
n[i-1]
n[i] <- 2*n[i-1]
nFor loop to generate a vector of numbers based on some mathematical function. For example:
\[n_t = 2(n_{t-1})\]
# Output:
n <- vector('numeric', length = 5)
# Seed:
n[1] <- 10
# For loop: each value is twice the value at the previous position,
# n_t = 2(n_{t-1}). The sequence starts at 2 because position 1 is the seed.
for (i in 2:length(n)) {
  n[i] <- 2 * n[i - 1]
}
# One of my favorite for loops was created by Leonardo Bonacci (Fibonacci). He created the first known population model, from which the famous Fibonacci number series was created. He described a population (N) of rabbits at time t as the sum of the population at the previous time step plus the time step before that:
\[N_t = N_{t-1} + N_{t-2}\]